R-Version: [Default] [32-bit] C:\Program Files\R\R-4.1.0
packages <- c("tidyverse", "data.table", "lubridate", "ggplot2", "ggthemes", "recommenderlab", "knitr")
# Noch nicht installierte Pakete installieren
installed_packages <- packages %in% rownames(installed.packages())
if (any(installed_packages == FALSE)) {
install.packages(packages[!installed_packages])
}
# Laden der Packete
invisible(lapply(packages, library, character.only = TRUE))
Registered S3 methods overwritten by 'dbplyr':
method from
print.tbl_lazy
print.tbl_sql
-- Attaching packages ----------------------------------------------------------------------------------------------------------- tidyverse 1.3.1 --
v ggplot2 3.3.5 v purrr 0.3.4
v tibble 3.1.3 v dplyr 1.0.7
v tidyr 1.1.3 v stringr 1.4.0
v readr 2.0.0 v forcats 0.5.1
-- Conflicts -------------------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
x dplyr::filter() masks stats::filter()
x dplyr::lag() masks stats::lag()
Registered S3 method overwritten by 'data.table':
method from
print.data.table
data.table 1.14.0 using 4 threads (see ?getDTthreads). Latest news: r-datatable.com
Attaching package: ‘data.table’
The following objects are masked from ‘package:dplyr’:
between, first, last
The following object is masked from ‘package:purrr’:
transpose
Attaching package: ‘lubridate’
The following objects are masked from ‘package:data.table’:
hour, isoweek, mday, minute, month, quarter, second, wday, week, yday, year
The following objects are masked from ‘package:base’:
date, intersect, setdiff, union
Loading required package: Matrix
Attaching package: ‘Matrix’
The following objects are masked from ‘package:tidyr’:
expand, pack, unpack
Loading required package: arules
Attaching package: ‘arules’
The following object is masked from ‘package:dplyr’:
recode
The following objects are masked from ‘package:base’:
abbreviate, write
Loading required package: proxy
Attaching package: ‘proxy’
The following object is masked from ‘package:Matrix’:
as.matrix
The following objects are masked from ‘package:stats’:
as.dist, dist
The following object is masked from ‘package:base’:
as.matrix
Loading required package: registry
# summaries zu "TRUE" setzen um summaries anzuzeigen
summaries = TRUE
data(MovieLense)
MovieLense
943 x 1664 rating matrix of class ‘realRatingMatrix’ with 99392 ratings.
movies <- as(MovieLense, "data.frame")
movies <- movies %>% mutate_if(is.character, as.factor)
head(movies)
NA
movies_wider <- pivot_wider(
movies,
id_cols = user,
names_from = item,
values_from = rating,
values_fill = NULL,
)
head(movies_wider)
df_1 <- movies %>% group_by(item) %>% summarize(mean_rating = mean(rating)) %>% sample_n(15) %>% arrange(desc(mean_rating))
ggplot(df_1, aes(y = reorder(item, +mean_rating), x = mean_rating)) +
geom_col(alpha = 1, fill = 'steelblue') +
scale_y_discrete(expand = c(0,0)) +
scale_x_continuous(expand = c(0,0)) +
geom_text(aes(label=round(mean_rating,2)), hjust = 1.3, color = 'white') +
labs(
title = "Durchschnittliche Filmbewertung",
subtitle = "Zufällige Stichprobe von 15 Filmen",
y = element_blank(), x = "Dirchschnittlich Bewertung in Sternen"
) +
theme_classic() +
theme(axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
axis.line.x = element_blank(),
text = element_text(size = 12) # text size
)
movies_genre <- MovieLenseMeta %>%
rename(item = title)
movies_genre$url <- NULL
movies_genre[movies_genre == 0] <- NA
a <- which(movies_genre==1,arr.ind=TRUE)
movies_genre[a] <- names(movies_genre)[a[,"col"]]
movies_genre <- movies_genre %>%
unite("genres", unknown:Western, sep= ",",
remove = TRUE, na.rm = TRUE)
genres<-merge(x=movies,y=movies_genre,by="item",all.x=TRUE)%>%
mutate(genres = strsplit(as.character(genres), ",")) %>%
unnest(genres)
df1a <- movies%>%
group_by(item)%>%
summarize(count=n())%>%
ungroup()%>%
arrange(desc(count))
df1a <- head(df1a, 10)
df1a %>%
mutate(item = fct_reorder(item, count))%>%
ggplot(aes(x = count, y = item))+
geom_col(alpha = 1, fill = 'steelblue')+
scale_y_discrete(expand = c(0,0)) +
scale_x_continuous(expand = c(0,0)) +
geom_text(aes(label=round(count,2)), hjust = 1.3, color = 'white') +
labs(
title = "Meist bewertete Filme",
y = element_blank(), x = "Anzahl Bewertungen"
) +
theme_classic() +
theme(axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
axis.line.x = element_blank(),
text = element_text(size = 12) # text size
)
Da in unserem Datensatz nur die Anzahl Ratings von Filmen gegeben ist, gehen wir davon aus, dass die meist bewerteten, auch die am meist geschauten Filme sind. In der Grafik sieht man die 10 meist bewerteten Filme.
df1b <- genres%>%
group_by(genres)%>%
summarize(count=n())%>%
ungroup()%>%
arrange(desc(count))
df1b%>%
mutate(genres = fct_reorder(genres, count))%>%
ggplot(aes(x = count, y = genres))+
geom_col(alpha = 1, fill = 'steelblue')+
scale_y_discrete(expand = c(0,0)) +
scale_x_continuous(expand = c(0,0)) +
geom_text(aes(label=count,2), hjust = 1.3, color = 'white') +
labs(
title = "Meist bewertete Genres",
y = element_blank(), x = "Anzahl Bewertungen"
) +
theme_classic() +
theme(axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
axis.line.x = element_blank(),
text = element_text(size = 12) # text size
)
Auch hier wird davon ausgegangen, dass die enres, welche am häufigsten bewertet wurden auch am häufigst geschaut wurden. In der Grafik ist zu sehen, dass Drama das top Genres ist, gefolgt von Comedy und Action.
ggplot(movies, aes(x = rating)) +
geom_bar(alpha = 1, fill = 'steelblue') +
scale_y_continuous(expand = c(0,0)) +
scale_x_continuous(expand = c(0,0)) +
labs(
title = "Verteilung Kundenratings gesamthaft",
subtitle = paste("N = ", nrow(movies), " Bewertungen"),
x = "Kundenbewertungen",
y = "Anzahl",
fill = element_blank()
) +
theme_classic() +
theme(
text = element_text(size = 12)
)
In dieser Grafik ist die Verteilung der bewertungen zu sehen. Die Bewertungen 4 und 5 wirden klar am häufigsten vergeben, wobei 1 und 2 eher selten bewertet werden.
# get rating count per user, add as column for further processing
counts <- movies %>% group_by(user) %>% count()
movies <- merge(movies, counts, by="user")
movies_wider <- merge(movies_wider, counts, by="user")
# avoid users with almost no ratings, use median as threshold
median_count <- median(counts$n)
print(median_count)
[1] 64
# get sample
set.seed(623)
movies_sample <- movies_wider %>% filter(n > median_count) %>% sample_n(5)
# create long table
movies_sample_long <- filter(movies, user %in% movies_sample$user)
# drop item names,
movies_sample_long <- subset(movies_sample_long, select = -c(item))
df2b <- genres%>%
group_by(genres)
movies_sample_long_grouped <- movies_sample_long %>% group_by(user, rating) %>% summarise(rating_dens = length(user) / first(n), user = first(user), n=first(n), rating = first(rating))
`summarise()` has grouped output by 'user'. You can override using the `.groups` argument.
ggplot(genres, aes(x = rating, fill = genres)) +
geom_bar(alpha = 1, bins = 10) +
facet_wrap(~genres)+
scale_y_continuous(expand = c(0,0)) +
scale_x_continuous(expand = c(0,0)) +
labs(
title = "Verteilung Kundenratings nach Genres",
subtitle = paste("N = ", nrow(movies), " Bewertungen"),
x = "Durchschnittliche Bewertung",
y = "Anzahl",
fill = element_blank()
) +
theme(
text = element_text(size = 12),
legend.position = 'none'
)
Warning: Ignoring unknown parameters: bins
Hier ist zu sehen, dass das Genres Drama am meisten bewertet wurde, wobei Dokumentationen am wenigsten Bewertungen erhalten haben. Die Bewertungen pro Genres verteilen sich jeweils sehr ähnlich. Die Verteilungen der einzelnen Genres sind ebenfalls ähnlich verteilt wie die bewertungen gesamthaft.
df3 <- movies %>%
group_by(item) %>%
summarize(
mean_rating = mean(rating),
ratings = n()
) %>%
mutate(
more_than_50 = ifelse(ratings >= 50, 'b) mehr als 50 Bewertungen', 'a) weniger als 50 Bewertugen')
)
ggplot(df3, aes(x = mean_rating)) +
geom_density(alpha = 1, fill = 'steelblue', bw = 0.08) +
scale_y_continuous(expand = c(0,0)) +
scale_x_continuous(expand = c(0,0)) +
labs(
title = "Verteilung mittlere Kundenratings pro Film",
subtitle = paste("N = ", nrow(df3), " Filme"),
x = "Durchschnittliche Bewertung",
y = "Dichte"
) +
theme_classic() +
theme(text = element_text(size = 12)
)
In dieser Grafik ist die durchschnittliche Bewertung pro Film zu sehen, wobei auch hier zu sehen ist ,dass die die meisten Filme eine Durchschnittliche Bewertung von ca. 3 - 3.5 haben.
ggplot(df3, aes(x = mean_rating, fill = more_than_50)) +
geom_density(alpha = 0.5, bw = 0.08) +
scale_y_continuous(expand = c(0,0)) +
scale_x_continuous(expand = c(0,0)) +
labs(
title = "Verteilung mittlere Kundenratings pro Film",
subtitle = "N = 1664 Filme",
x = "Durchschnittliche Bewertung",
y = "Dichte",
fill = element_blank()
) +
theme_classic() +
theme(
text = element_text(size = 12),
legend.position = 'bottom'
)
Für diese Grafik wurden die Filme in zwei gruppen unterteilt: Filme die weniger als 50 bewertungen erhalten haben, und Filme welche mehr als 50 Bewertungen erhalten haben. In der Grafik ist imernoch die durchschnittliche Bewertung dieser Filme zu sehen wobei deutlich erkannt werden kann, dass filme welche weniger bewertungen erhalten haben, tendenziell auch schlechter bewertet wurden.
# Number of ratings per user per rating value
movies_sample_long_grouped <- movies_sample_long %>% group_by(user, rating) %>% summarise(rating_dens = length(user) / first(n), user = first(user), n=first(n), rating = first(rating))
`summarise()` has grouped output by 'user'. You can override using the `.groups` argument.
movies_sample_long_grouped
movies_sample_long
ggplot(movies_sample_long_grouped, aes(x=rating, y = rating_dens, fill=user)) +
geom_col(position=position_dodge()) +
scale_y_continuous(expand = c(0,0)) +
scale_x_continuous(expand = c(0,0)) +
labs(
title = "Streuung Kundenbewertungen für zufällig gewählte Kunden",
subtitle = "N = 5 Kunden",
x = "User Bewertung (1-5)",
y = "Ausprägung Rating",
fill = element_blank()
) +
scale_fill_manual("legend", values = c("cyan3", "cyan4", "darkolivegreen3", "darkolivegreen", "coral4")
)+
theme_classic() +
theme(
text = element_text(size = 12),
legend.position = 'bottom'
)
In dieser Grafik sehen wir, wie sich die Bewertungen einzelner Kunden verteilen. Auffallend ist generell, dass die Bewertungen 1 und 2 weniger oft abgegeben wurde als 3 und 4. Bei der Verteilung der ratings sind von User zu User Unterschiede feststellbar. User 24 bewertet beispielsweise viel besser als User 639. Dies könnte bedeuten, dass User 24 nur Filme bewertet oder schaut die er/sie mag, oder grundsätzlich höhere Bewertungen abgibt. Leider sehen wir hier weniger gut, welche Tendenzen die Streuung der Rating aller User aufweisen.
movies_span <- movies %>% group_by(user) %>%
summarize(mean = mean(rating), min = min(rating), max = max(rating), span = (max(rating) - min(rating)))
movies_span
set.seed(123)
ggplot(sample_n(movies_span, 20), aes(x=user)) +
geom_point(colour="black", aes(y=mean), shape=21) +
geom_errorbar(aes(ymin=min, ymax=max)) +
labs(
title = "Spannweite Kundenratings ",
subtitle = "N = 20 Kunden",
x = "User ID",
y = "Rating Range"
)+
theme_classic() +
theme(
text = element_text(size = 12),
legend.position = 'bottom'
)
ggplot(movies_span, aes(x=user)) +
geom_bar(colour="black", aes(span)) +
labs(
title = "Spannweite Kundenratings",
subtitle = "",
x = "Spannweite",
y = "Anzahl User"
)+
theme_classic() +
theme(
text = element_text(size = 12),
legend.position = 'bottom'
)
NA
0In diesen Grafiken sehen wir detailliertere Informationen über die Spannweite und den Mittelpunkt. In der ersten Übersicht ist die Spannweite und der Mittelpunkt einzelner Kunden dargestellt. Es fällt auf, dass trotz des teilweise relativ hohem Mittelwert alle Ratings von 1-5 abgegeben wurden. Ein rating von 5 wurde sozusagen immer abgegeben, 1 nicht immer. In der zweiten Übersicht ist die Spannweite aller Kunden dargestellt. Hier wird sichtbar, dass die meisten Kunden Bewertungen von 1-5 abgegeben haben (Spannweite=4), und nur weinige sehr homogen bewertet haben (Spannweite = 1-2). Eine kleine Spannweite kann hier auch aufgetreten sein, da diese User sehr wenige Bewertungen abgegeben haben.
hist(getRatings(MovieLense),
breaks=15,
main = "Verteilung der Bewertungen")
#hist(getRatings(MovieLenseNorm), breaks=40)
Die Ratings sind nun ungefähr Normalverteilt mit einem Durchschnittsrating von 0 und einer Standardabweichung von 1. Erkennbar ist, dass die Verteilung rechtssteil und linksschief ist, also mehrheitlich positive Bewertungen abgegeben wurden. Durch die Normierung der Daten werden die Ratings jedes Users auf dieselbe Verteilung gestaucht, wodurch man die Verteilung aller Daten analysieren kann. Dadurch hat man beispielsweise die Möglichkeit die durchschnittliche Bewertungstendenz herauszufinden.
image(MovieLense, main = "Raw Ratings")
MovieLenseNorm <- normalize(MovieLense, method="Z-score")
image(MovieLenseNorm, main = "Normalized Ratings")
Users mit tiefen ID’s und Filme mit hohen ID’s weisen weniger ratings auf. Filme mit tiefer ID jedoch sehr viele. Auffallend ist, dass es einige wenige User gibt, die fast alle Filme bewertet haben (erkennbar durch die horizontalen scharzen Striche). Dies scheinen sehr aktive Bewerter zu sein. Viele Users haben jedoch nur einen kleinen Teil der Filme bewertet. Bei den Filmen ist eine ähnliche Tendenz wahrzunehmen, jedoch sind die vertikalen Striche breiter. Möglicherweise sind dort einige beliebte Filme zusammengefasst.
# convert into df
data <- as(MovieLense, "data.frame")
# get the 400 users with most ratings
counts <- data %>% group_by(user) %>% count() %>% arrange(desc(n), user) %>% head(400)
data <- inner_join(counts, data, by="user")
data <- data %>% select(user, item, rating) %>% ungroup
data <- as.data.frame(data)
# get the 700 Movies with most ratings
counts <- data %>% group_by(item) %>% count() %>% arrange(desc(n), item) %>% head(700)
data <- inner_join(counts, data, by="item")
data <- data %>% select(user, item, rating) %>% ungroup
data <- as.data.frame(data)
# convert back into realRatingMatrix
ratingMatrix <- as(data, "realRatingMatrix")
ratingMatrix
400 x 700 rating matrix of class ‘realRatingMatrix’ with 67765 ratings.
get_sparsity <- function(Matrix) {
round(( 1 - (nratings(Matrix) / (dim(Matrix)[1] * dim(Matrix)[2]))) * 100,2)
}
show_sparsity <- function(Matrix, Name) {
Measurement <- list('Matrix','Dimension', 'Sparsity', 'Density')
Value <- list(Name, paste('(',toString(dim(Matrix)), ')'),paste(get_sparsity(Matrix), '%' ), paste(100 - get_sparsity(Matrix), '%' ))
df <- cbind(Measurement,Value)
head(df)
}
show_sparsity_change <- function(oldMatrix, newMatrix) {
print(list(show_sparsity(oldMatrix, 'Old Matrix'), show_sparsity(newMatrix, 'New Matrix')))
}
show_sparsity_change(MovieLense, ratingMatrix)
[[1]]
Measurement Value
[1,] "Matrix" "Old Matrix"
[2,] "Dimension" "( 943, 1664 )"
[3,] "Sparsity" "93.67 %"
[4,] "Density" "6.33 %"
[[2]]
Measurement Value
[1,] "Matrix" "New Matrix"
[2,] "Dimension" "( 400, 700 )"
[3,] "Sparsity" "75.8 %"
[4,] "Density" "24.2 %"
old_matrix <- as(MovieLense, "data.frame") %>%
group_by(item) %>%
summarize(
mean_rating = mean(rating),
ratings = n()
) %>%
mutate(
matrix = 'a) alte Matrix'
)
new_matrix <- as(ratingMatrix, "data.frame") %>%
group_by(item) %>%
summarize(
mean_rating = mean(rating),
ratings = n()
) %>%
mutate(
matrix = 'b) neue Matrix'
)
comparison <- bind_rows(old_matrix, new_matrix)
ggplot(comparison, aes(x = mean_rating, fill = matrix)) +
geom_density(alpha = 0.5, bw = 0.08) +
scale_y_continuous(expand = c(0,0)) +
scale_x_continuous(expand = c(0,0)) +
labs(
title = "Verteilung mittlere Kundenratings pro Film",
subtitle = "N = 1664 Filme",
x = "Durchschnittliche Bewertung",
y = "Dichte",
fill = element_blank()
) +
theme_classic() +
theme(
text = element_text(size = 12),
legend.position = c(.90, .95)
)
image(ratingMatrix, main = "Raw Ratings")
#split <- rowCount(ratingMatrix) * 0.75
# train <- ratingMatrix[1:300]
# test <- ratingMatrix[301:400]
# train-test split
set.seed(42)
data <- as(ratingMatrix, "data.frame")
df <- data %>% group_by(user) %>% summarize(mean_rating = mean(rating))
df <- sample_frac(df, size = 0.8, replace = FALSE)
df_train <- semi_join(data,df,by='user')
df_test <- anti_join(data,df_train,by='user')
train <- as(df_train, "realRatingMatrix")
test <- as(df_test, 'realRatingMatrix')
dim(train)
[1] 320 700
dim(test)
[1] 80 700
rec <- Recommender(train, method = "IBCF", param=list(method="Cosine", k=30, normalize = NULL, na_as_zero = TRUE)) #normalize = 'center'
rec
Recommender of type ‘IBCF’ for ‘realRatingMatrix’
learned using 320 users.
# predict top 10 movies for 100 users
pre <- predict(rec, test, n = 10)
pre
Recommendations as ‘topNList’ with n = 10 for 80 users.
reco_list <- as(pre, "list")
# top 10 recommendations for the 13th user in reco_list
reco_list[13]
$`254`
[1] "Belle de jour (1967)" "Three Colors: Red (1994)" "Unbearable Lightness of Being, The (1988)"
[4] "Wings of Desire (1987)" "Piano, The (1993)" "Jackal, The (1997)"
[7] "Eve's Bayou (1997)" "Devil's Advocate, The (1997)" "Cat on a Hot Tin Roof (1958)"
[10] "Charade (1963)"
#image(as(pre, "matrix"))
model <- getModel(rec)
colSum <- colSums(model$sim > 0)
df <- as.data.frame(colSum)
# add index column
df <- cbind(item = rownames(df), df)
rownames(df) <- 1:nrow(df)
ggplot(df, aes(x = colSum)) +
geom_density(alpha = 1, fill = 'steelblue', bw = 4) +
scale_y_continuous(expand = c(0,0)) +
scale_x_continuous(expand = c(0,0)) +
labs(
title = "Verteilung der Anzahl ähnlicher Filme",
# subtitle = paste("N = ", nrow(df3), " Filme"),
x = "Häufigkeit zu der der Film als Nachbar auftaucht",
y = "Häufigkeit"
) +
theme_classic() +
theme(text = element_text(size = 12)
)
df1 <- df %>% arrange(desc(colSum)) %>% head(10)
df1
ggplot(df1, aes(x = colSum, y = reorder(item, +colSum)))+
geom_col(alpha = 1, fill = 'steelblue')+
scale_y_discrete(expand = c(0,0)) +
scale_x_continuous(expand = c(0,0)) +
geom_text(aes(label=round(colSum,2)), hjust = 1.3, color = 'white') +
labs(
title = "Häufigste Filme in Cosine-Ähnlichkeitsmatrix",
y = element_blank(),
x = "Anzahl Filme in deren Nachbarschaft der Film ist"
) +
theme_classic() +
theme(axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
axis.line.x = element_blank(),
text = element_text(size = 12) # text size
)
top10 <- as.list(df1)$item
data <- as(ratingMatrix, "data.frame")
data1 <- data %>%
group_by(item) %>%
summarize(mean_rating = mean(rating)) %>%
arrange(desc(mean_rating)) %>%
mutate(category = ifelse(item %in% top10, 'Häufigste 10 Filme', 'Restliche Filme'))
ggplot(data1, aes(x = mean_rating, fill = category)) +
geom_density(alpha = 0.5, bw = 0.05) +
scale_y_continuous(expand = c(0,0)) +
scale_x_continuous(expand = c(0,0)) +
labs(
title = "Verteilung mittlere Kundenratings pro Film",
x = "Durchschnittliche Bewertung",
y = "Dichte",
fill = element_blank()
) +
theme_classic() +
theme(
text = element_text(size = 12),
legend.position = c(.14, .93)
)
show_precision <- function(listOfDifferentN, ratingMatrix, threshold) {
# normalize the rating matrix
ratingMatrix <- normalize(ratingMatrix, method="Z-score", row=TRUE)
# create a training set and a test set with true positives for recall and precision
data <- as(ratingMatrix, "data.frame")
relevant <- data %>% group_by(user) %>% sample_n(30)
true_positives <- relevant %>% filter(rating >= threshold)
false_positives <- relevant %>% filter(rating < threshold)
# remove testing observations from training set
train <- anti_join(data, relevant,by=c('user','item'))
train <- as(train, 'realRatingMatrix')
# train model based on training set
rec <- Recommender(train, method = "IBCF", param=list(method="Cosine", k=30, normalize = NULL, na_as_zero = TRUE)) #normalize = 'center', 'Z-score'
for (N in listOfDifferentN) {
# predict top N movies
pre <- predict(rec, train, n = N)
reco_list <- as(pre, "list")
recommendations <- as.data.frame(reco_list)
# find true positives and false positives for all users and add them up
true_total <- 0
false_total <- 0
for (i in as.list(unique(true_positives['user']))$user) {
our_user <- paste('X', i, sep = '')
recommendations['item'] <- recommendations[our_user]
true_total <- true_total + nrow(inner_join(recommendations['item'], true_positives %>% filter(user == as.integer(i)), by = 'item'))
false_total <- false_total + nrow(inner_join(recommendations['item'], false_positives %>% filter(user == as.integer(i)), by = 'item'))
}
# print Summary
print(paste('N =', N))
print(paste('Number of True Positives:',true_total))
print(paste('Number of False Positives:',false_total))
print(paste('Precision:',true_total / (true_total + false_total)))
print('')
}
}
show_precision(c(5,10,15,20,25,30), ratingMatrix, 0)
[1] "N = 5"
[1] "Number of True Positives: 68"
[1] "Number of False Positives: 24"
[1] "Precision: 0.739130434782609"
[1] ""
[1] "N = 10"
[1] "Number of True Positives: 162"
[1] "Number of False Positives: 45"
[1] "Precision: 0.782608695652174"
[1] ""
[1] "N = 15"
[1] "Number of True Positives: 274"
[1] "Number of False Positives: 66"
[1] "Precision: 0.805882352941176"
[1] ""
[1] "N = 20"
[1] "Number of True Positives: 403"
[1] "Number of False Positives: 105"
[1] "Precision: 0.793307086614173"
[1] ""
[1] "N = 25"
[1] "Number of True Positives: 515"
[1] "Number of False Positives: 135"
[1] "Precision: 0.792307692307692"
[1] ""
[1] "N = 30"
[1] "Number of True Positives: 632"
[1] "Number of False Positives: 177"
[1] "Precision: 0.781211372064277"
[1] ""
rec <- Recommender(train, method = "IBCF", param=list(method="Cosine", k=30, normalize = NULL, na_as_zero = TRUE)) #normalize = 'center', 'Z-score'
show_coverage <- function(listOfDifferentN, recommender) {
for (N in listOfDifferentN) {
# predict top N movies
pre <- predict(rec, train, n = N)
reco_list <- as(pre, "list")
recommendations <- as.data.frame(reco_list)
all_recommendations <- list()
# find true positives and false positives for all users and add them up
for (i in colnames(recommendations)) {
all_recommendations <- append(all_recommendations, dplyr::pull(recommendations[i]))
}
# print summary
print(paste('N =', N))
print(paste0(paste('Coverage over Item-space: ', round(length(unique(all_recommendations)) / dim(train)[2] * 100, digits = 2)),'%'))
print('')
}
}
show_coverage(c(5,10,15,20,25,30), rec)
[1] "N = 5"
[1] "Coverage over Item-space: 63.71%"
[1] ""
[1] "N = 10"
[1] "Coverage over Item-space: 83.14%"
[1] ""
[1] "N = 15"
[1] "Coverage over Item-space: 92.29%"
[1] ""
[1] "N = 20"
[1] "Coverage over Item-space: 96%"
[1] ""
[1] "N = 25"
[1] "Coverage over Item-space: 97.71%"
[1] ""
[1] "N = 30"
[1] "Coverage over Item-space: 99.14%"
[1] ""
image(as(rec@model$sim, "realRatingMatrix"))
#similarity
#image(as(similarity(train, method = "Cosine", which = "items"), "matrix"))
# plotSimilarityMatrix(train, y = NULL, clusLabels = NULL, colX = NULL, colY = NULL, myLegend = NULL, fileName = "posteriorSimilarityMatrix", savePNG = FALSE, semiSupervised = FALSE, showObsNames = FALSE, clr = FALSE, clc = FALSE, plotWidth = 500, plotHeight = 450)
cosine_sim <- function(A, B)
{
similarity <- A %*% B / (norm(A, type="2") * norm(B, type="2"))
return(similarity)
}
jaccard_sim <- function(A, B)
{
inter = length(intersect(A, B))
union = length(A) + length(B) - inter
jac = inter / union
return (jac)
}
A <- c(5, 3, 2, 1)
B <- c(1, 2, 3, 4)
cosine_sim(A, B)
[,1]
[1,] 0.6139406
jaccard_sim(A, B)
[1] 0.6
#library(lsa)
#cosine(A, B)
similarity <- as.matrix(rec@model$sim)
dim(similarity)
[1] 700 700
wide_matrix <- as.matrix(subset(movies_wider, select = -c(user)))
# replace nas with 0 (no adjusted cosine similarity)
wide_matrix[is.na(wide_matrix)] <- 0
# ibcf, because columns are taken here
# row count
len <- dim(wide_matrix)[2]
res <- diag(len)
for(i in 1:len)
{
for(j in 1:len)
{
if(i < j & i != j)
{
res[i,j] <- cosine_sim(wide_matrix[,i], wide_matrix[,j])
res[j,i] <- res[i,j]
}
}
}
res[1:10, 1:10]
[,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
[1,] 1.0000000 0.40238218 0.3302448 0.45493792 0.28671351 0.11634398 0.6209786 0.48111389 0.4962884 0.27393511
[2,] 0.4023822 1.00000000 0.2730692 0.50257077 0.31883618 0.08356281 0.3834034 0.33700186 0.2552520 0.17108221
[3,] 0.3302448 0.27306918 1.0000000 0.32486639 0.21295656 0.10672227 0.3729207 0.20079389 0.2736693 0.15810426
[4,] 0.4549379 0.50257077 0.3248664 1.00000000 0.33423948 0.09030829 0.4892828 0.49023553 0.4190436 0.25256072
[5,] 0.2867135 0.31883618 0.2129566 0.33423948 1.00000000 0.03729866 0.3347686 0.25916097 0.2724484 0.05545322
[6,] 0.1163440 0.08356281 0.1067223 0.09030829 0.03729866 1.00000000 0.1396166 0.08387647 0.1510645 0.20309700
[7,] 0.6209786 0.38340339 0.3729207 0.48928280 0.33476858 0.13961658 1.0000000 0.42351452 0.5274623 0.31862281
[8,] 0.4811139 0.33700186 0.2007939 0.49023553 0.25916097 0.08387647 0.4235145 1.00000000 0.4244289 0.26776402
[9,] 0.4962884 0.25525203 0.2736693 0.41904357 0.27244840 0.15106449 0.5274623 0.42442894 1.0000000 0.28851441
[10,] 0.2739351 0.17108221 0.1581043 0.25256072 0.05545322 0.20309700 0.3186228 0.26776402 0.2885144 1.00000000
#dim(wide_matrix)
#cosine_sim(wide_matrix[,1], wide_matrix[,2])
#wide_matrix[2,1]
#as.matrix(subset(movies_wider, select = -c(user)))[,2]
df_res <- as.data.frame(res)
df_res